STATS 762: Assignment 4

Author: Tarin Eccleston

Appendix

Evaluation Function: Computes classification performance estimates avg accuracy and F1-score

measure.classification = function(class.pred, class.obs, conf.mx){
  entropy = 0 
 # for(i in levels(class.pred)){
  #  entropy = entropy-sum(log(prob.class[class.obs==i,colnames(prob.class)==i])) }
  avg.accuracy=0
  for (i in 1:ncol(conf.mx)){
    avg.accuracy = avg.accuracy + (conf.mx[i,i]+sum(conf.mx[-i,-i]))/sum(conf.mx)
  }
  avg.accuracy=avg.accuracy/i
  precision = mean(diag(conf.mx)/rowSums(conf.mx))
  recall = mean(diag(conf.mx)/colSums(conf.mx))
  F1.score = 2*precision*recall/(precision+recall)
  return(data.frame(avg.accuracy=avg.accuracy,F1.score=F1.score))
}

Cardiovascular diseases (CVDs) are one of major causes of death glob-

ally. About 17.9 million lives die each year and this is about 31% of all deaths worldwide. Heart failure is a common event caused by CVDs and the dataset train.csv contains 10 features that can be used to predict mortality by heart failure.

setwd("/Users/tarineccleston/Documents/Masters/STATS 762/regression-for-DS/disease-and-weather/")
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()  masks randomForest::combine()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(MASS) 
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(klaR)
library(e1071)

library(lubridate)

library(splines)

1a) Which explanatory variables are nominal? Which explanatory vari-

ables are numeric? Identify them and modify data to read explana- tory variables correctly.

cvd_df = read_csv("data/train.csv")
## Rows: 299 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): age, anaemia, creatinine_phosphokinase, diabetes, ejection_fractio...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(cvd_df)
## Rows: 299
## Columns: 11
## $ age                      <dbl> 75, 55, 65, 50, 65, 90, 75, 60, 65, 80, 75, 6…
## $ anaemia                  <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ creatinine_phosphokinase <dbl> 582, 7861, 146, 111, 160, 47, 246, 315, 157, …
## $ diabetes                 <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ ejection_fraction        <dbl> 20, 38, 20, 20, 20, 40, 15, 60, 65, 35, 38, 2…
## $ high_blood_pressure      <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ platelets                <dbl> 265000, 263358, 162000, 210000, 327000, 20400…
## $ serum_sodium             <dbl> 130, 136, 129, 137, 116, 132, 137, 131, 138, …
## $ sex                      <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, …
## $ smoking                  <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, …
## $ death                    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …

Nominal: anaemia, diabetes, high_blood_pressure, sex, smoking, death Numeric: age, creatinine_phosphokinase, ejection_fraction, platelets, serum_sodium

# force to factor
cvd_df$death = as.factor(cvd_df$death)
cvd_df$anaemia = as.factor(cvd_df$anaemia)
cvd_df$diabetes = as.factor(cvd_df$diabetes)
cvd_df$high_blood_pressure = as.factor(cvd_df$high_blood_pressure)
cvd_df$sex = as.factor(cvd_df$sex)
cvd_df$smoking = as.factor(cvd_df$smoking)

1b) Fit an appropriate regression model using all explanatory variables.

Write the reason for your model choice.

NOTE: d) find best classifier NOTE: Compare the best between b and d NOTE: Make stronger comparison between d and b NOTE: Maybe use age and ejection fraction one the best model in d and then compare NOTE: Find best model in b then find the best model in d and compare NOTE: Fix scaling

# use 80/20 train/test split for an unbiased estimate
index = sample(nrow(cvd_df), as.integer(nrow(cvd_df)*0.8), replace = FALSE)
cvd_train_df = cvd_df[index, ]
cvd_test_df = cvd_df[-index, ]

# fit model
cvd_log_fit = glm(death ~ ., data = cvd_train_df, family = 'binomial')
Logistic Regression
death_probs = data.frame(probs = predict(cvd_log_fit, cvd_test_df, type="response"))
death_pred_logistic = ifelse(death_probs > 0.5, 1, 0)

conf_mx_logistic = table(death_pred_logistic, cvd_test_df$death)
conf_mx_logistic
##                    
## death_pred_logistic  0  1
##                   0 35 17
##                   1  4  4
metric_comparison = data.frame()

metrics_logistic = measure.classification(death_pred_logistic, cvd_test_df$death, conf_mx_logistic)
rownames(metrics_logistic) = "Logistic Regression"

metric_comparison = rbind(metric_comparison, metrics_logistic)

Fit logistic regression first as this is the simplest classification model and can be used as a benchmark. We are predicting the likelihood of death (0 or 1) given all other explanatory variables. Probabilities above 0.5 result in death, otherwise anything below 0.5 will result in living.

Decision Tree

cvd_dt_fit <- rpart(death ~ ., data = cvd_train_df, method = "class", cp=0.001)
cvd_dt_fit$cptable
##            CP nsplit rel error    xerror       xstd
## 1 0.200000000      0 1.0000000 1.0000000 0.09565162
## 2 0.120000000      1 0.8000000 0.8266667 0.09034877
## 3 0.026666667      2 0.6800000 0.6933333 0.08504764
## 4 0.008888889      5 0.6000000 0.7866667 0.08887959
## 5 0.001000000      8 0.5733333 0.8533333 0.09127436

Tree No. 4 provides the best accuracy with a dip in xerror. Find the parsimonious model by adding the standard error, resulting in tree no. 3 with 2 splits.

cp.1se= max(cvd_dt_fit$cptable[cvd_dt_fit$cptable[,4]<sum(cvd_dt_fit$cptable[which.min(cvd_dt_fit$cptable[,4]),c(4,5)]),1])
cp.1se
## [1] 0.02666667
cvd_dt_prune_fit <-prune(cvd_dt_fit, cp = cp.1se)
cvd_dt_prune_fit
## n= 239 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 239 75 0 (0.6861925 0.3138075)  
##   2) ejection_fraction>=22.5 220 58 0 (0.7363636 0.2636364)  
##     4) age< 79 199 43 0 (0.7839196 0.2160804) *
##     5) age>=79 21  6 1 (0.2857143 0.7142857) *
##   3) ejection_fraction< 22.5 19  2 1 (0.1052632 0.8947368) *
death_pred_dt_pruned = predict(cvd_dt_prune_fit, cvd_test_df, type="class")

conf_mx_dt_pruned = table(death_pred_dt_pruned, cvd_test_df$death)
conf_mx_dt_pruned
##                     
## death_pred_dt_pruned  0  1
##                    0 36 16
##                    1  3  5
metrics_dt = measure.classification(death_pred_dt_pruned, cvd_test_df$death, conf_mx_dt_pruned)
rownames(metrics_dt) = "Decision Tree"

metric_comparison = rbind(metric_comparison, metrics_dt)

Decision Tree performs worse. This could be due to use discretion of the decision space, which could result in a loss of information from other variables.

Random Forest

cvd_rf_fit = randomForest(death ~ ., data = cvd_train_df, importance=TRUE)
plot(cvd_rf_fit)

death_rf_pred = predict(cvd_dt_fit, cvd_test_df, type="class")

conf_mx_rf = table(death_rf_pred, cvd_test_df$death)
conf_mx_rf
##              
## death_rf_pred  0  1
##             0 30 11
##             1  9 10
metrics_rf = measure.classification(death_rf_pred, cvd_test_df$death, conf_mx_rf)
rownames(metrics_rf) = "Random Forest"

metric_comparison = rbind(metric_comparison, metrics_rf)

1c) In your final model in (b), which variables are useful in predicting

death due to heart failure? List the most important two variables

summary(cvd_log_fit)
## 
## Call:
## glm(formula = death ~ ., family = "binomial", data = cvd_train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7018  -0.8040  -0.4427   0.8417   2.6494  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               8.432e+00  4.998e+00   1.687   0.0916 .  
## age                       6.320e-02  1.454e-02   4.346 1.38e-05 ***
## anaemia1                  8.662e-02  3.319e-01   0.261   0.7941    
## creatinine_phosphokinase  2.876e-04  1.539e-04   1.868   0.0618 .  
## diabetes1                 3.556e-01  3.324e-01   1.070   0.2847    
## ejection_fraction        -8.174e-02  1.841e-02  -4.439 9.04e-06 ***
## high_blood_pressure1      4.470e-01  3.368e-01   1.327   0.1845    
## platelets                -1.264e-06  1.785e-06  -0.708   0.4790    
## serum_sodium             -7.550e-02  3.671e-02  -2.056   0.0397 *  
## sex1                     -7.290e-02  3.882e-01  -0.188   0.8510    
## smoking1                 -3.178e-01  3.865e-01  -0.822   0.4110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 297.37  on 238  degrees of freedom
## Residual deviance: 238.53  on 228  degrees of freedom
## AIC: 260.53
## 
## Number of Fisher Scoring iterations: 5

Age and ejection fraction appear to have the smallest p-values in the summary table, they appear to be the two most useful variables to predict death due to heart failure. However we cannot rely on this statistic alone, so we use variable important from the decision tree.

dt_important = data.frame(importance = cvd_dt_fit$variable.importance)
ranking = dt_important %>% 
  tibble::rownames_to_column() %>% 
  dplyr::rename("variable" = rowname) %>% 
  dplyr::arrange(importance) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(ranking) +
  geom_col(aes(x = variable, y = importance),
           col = "black", show.legend = F) +
  coord_flip() +
  scale_fill_grey() +
  theme_bw()

A decision tree works by selection the variable and decision boundary would would provide the greater information gain in separating classes death (1) and not death (0). Variables of higher importance are use for splits higher up on the decision tree. Looking at the variable importance graph, we can confirm that ejection fraction and age are the two most important variables to predict death caused by cvd.

1d) We have learnt various classification methods and, some classifiers are

only applicable to numeric explanatory variables. If we consider only numeric explanatory variables, do we get a better classifer? Compare classifiers using the same validation method in (b) and answer the question

Use now only age, creatinine_phosphokinase, ejection_fraction, platelets, serum_sodium in our model. We can now use LDA, QDA and SVM on-top of decision trees, random forests and boosting methods. Let’s compare their relative performances using the aforementioned evaluation metrics. Let’s do an EDA first…

Exploratory Data Analysis

# standardise covariates for the whole dataset for the EDA
exclude_cols <- c("death", "anaemia", "diabetes", "high_blood_pressure", "sex", "smoking")
cvd_df[, -which(names(cvd_df) %in% exclude_cols)] <- scale(cvd_df[, -which(names(cvd_df) %in% exclude_cols)])

legend_colors = c("red", "blue")
{
pairs(cvd_df[c("age", "creatinine_phosphokinase", "ejection_fraction", "platelets", "serum_sodium")],
      col = ifelse(cvd_df$death == 1, "red", "blue"),
      main = "Pairs Plot of Explanatory Variables and Death")

legend("topright", legend = c("Death", "Survived"), col = legend_colors, pch = 1)
}

class_freq = table(cvd_df$death)

class_labels = recode(names(class_freq), "1" = "death", "0" = "survived")

ggplot(data.frame(Class = class_labels, Frequency = as.vector(class_freq)), aes(x = Class, y = Frequency, fill = Class)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("red", "blue")) +
  xlab("Class") +
  ylab("Frequency") +
  ggtitle("Class Imbalance in the Dataset")

  • There appears to be a somewhat clear separation vertical linear boundary between death and survival at the lower end of ejection fraction compared to all other explanatory variables.

  • Observations with a greater age appear to have a higher chance of dying due to CVD.

  • Platelets and serum sodium vs other explanatory variables appear to have a appear to have slight quadratic decision boundaries between death and survival.

  • No notable blobs in the pair plot.

  • No distinguishable distribution or clusters between death and surviving classes for all pairs of explanatory variables.

  • There appears to be a class imbalance between survived and dead observations, with the survived class having twice more observations.

LDA

cvd_lda_fit = lda(death ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, cvd_train_df)
cvd_lda_fit$means
##        age creatinine_phosphokinase ejection_fraction platelets serum_sodium
## 0 59.01626                 545.1159          40.14024  268394.8     137.1707
## 1 65.50223                 745.8267          32.73333  254261.1     134.9733
# plot boundaries
partimat(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, method = "lda")

#prediction 
death_lda_pred = predict(cvd_lda_fit, cvd_test_df)
death_lda_pred = as.data.frame(death_lda_pred)

#confusion matrix
lda_mx <- table(death_lda_pred$class, cvd_test_df$death)
lda_mx
##    
##      0  1
##   0 37 16
##   1  2  5
# metrics_lda = measure.classification(death_lda_pred$class, death_lda_pred$posterior.1, cvd_test_df$death, lda_mx)
metrics_lda = measure.classification(death_lda_pred$class, cvd_test_df$death, lda_mx)
rownames(metrics_lda) = "LDA"

metric_comparison = rbind(metric_comparison, metrics_lda)

QDA

cvd_qda_fit = qda(death ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, cvd_train_df)
cvd_qda_fit$means
##        age creatinine_phosphokinase ejection_fraction platelets serum_sodium
## 0 59.01626                 545.1159          40.14024  268394.8     137.1707
## 1 65.50223                 745.8267          32.73333  254261.1     134.9733
# plot boundaries
partimat(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, method = "qda")

#prediction 
death_qda_pred = predict(cvd_qda_fit, cvd_test_df)
death_qda_pred = as.data.frame(death_qda_pred)

#confusion matrix
qda_mx <- table(death_qda_pred$class, cvd_test_df$death)
qda_mx
##    
##      0  1
##   0 38 19
##   1  1  2
# metrics_qda = measure.classification(death_qda_pred$class, death_qda_pred$posterior.1, cvd_test_df$death, qda_mx)
metrics_qda = measure.classification(death_qda_pred$class, cvd_test_df$death, qda_mx)
rownames(metrics_qda) = "QDA"

metric_comparison = rbind(metric_comparison, metrics_qda)

SVM (Linear Kernel)

cvd_svm_linear_fit <- svm(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, kernel = 'linear', probability = TRUE)

# most interesting plots
{
plot(cvd_svm_linear_fit, cvd_train_df, age ~ ejection_fraction)
plot(cvd_svm_linear_fit, cvd_train_df, age ~ creatinine_phosphokinase)
plot(cvd_svm_linear_fit, cvd_train_df, age ~ platelets)
plot(cvd_svm_linear_fit, cvd_train_df, age ~ serum_sodium)
plot(cvd_svm_linear_fit, cvd_train_df, creatinine_phosphokinase ~ ejection_fraction)
plot(cvd_svm_linear_fit, cvd_train_df, ejection_fraction ~ serum_sodium)
}

Age vs Ejection Fraction shows the strongest decision boundary to distinguish the positive and negative classes in absence of a distribution.

death_svm_linear_pred = predict(cvd_svm_linear_fit, cvd_test_df)

svm_linear_mx = table(death_svm_linear_pred, cvd_test_df$death)
svm_linear_mx
##                      
## death_svm_linear_pred  0  1
##                     0 36 14
##                     1  3  7
metrics_svm_linear = measure.classification(death_svm_linear_pred, cvd_test_df$death, svm_linear_mx)
rownames(metrics_svm_linear) = "SVM (Linear Kernel)"

metric_comparison = rbind(metric_comparison, metrics_svm_linear)

SVM (Polynomial Kernel)

cvd_svm_poly_fit = svm(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, kernel = 'polynomial', probability = TRUE)

{
plot(cvd_svm_poly_fit, cvd_train_df, age ~ ejection_fraction)
plot(cvd_svm_poly_fit, cvd_train_df, age ~ creatinine_phosphokinase)
plot(cvd_svm_poly_fit, cvd_train_df, age ~ platelets)
plot(cvd_svm_poly_fit, cvd_train_df, age ~ serum_sodium)
plot(cvd_svm_poly_fit, cvd_train_df, creatinine_phosphokinase ~ ejection_fraction)
plot(cvd_svm_poly_fit, cvd_train_df, ejection_fraction ~ serum_sodium)
}

death_svm_poly_pred = predict(cvd_svm_poly_fit, cvd_test_df)

svm_poly_mx = table(death_svm_poly_pred, cvd_test_df$death)
svm_poly_mx
##                    
## death_svm_poly_pred  0  1
##                   0 38 19
##                   1  1  2
metrics_svm_poly = measure.classification(death_svm_poly_pred, cvd_test_df$death, svm_poly_mx)
rownames(metrics_svm_poly) = "SVM (Polynomial Kernel)"

metric_comparison = rbind(metric_comparison, metrics_svm_poly)

SVM (Radial Kernel)

cvd_svm_rad_fit = svm(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, kernel = 'radial', probability = TRUE)

{
plot(cvd_svm_rad_fit, cvd_train_df, age ~ ejection_fraction)
plot(cvd_svm_rad_fit, cvd_train_df, age ~ creatinine_phosphokinase)
plot(cvd_svm_rad_fit, cvd_train_df, age ~ platelets)
plot(cvd_svm_rad_fit, cvd_train_df, age ~ serum_sodium)
plot(cvd_svm_rad_fit, cvd_train_df, creatinine_phosphokinase ~ ejection_fraction)
plot(cvd_svm_rad_fit, cvd_train_df, ejection_fraction ~ serum_sodium)
}

death_svm_rad_pred = predict(cvd_svm_rad_fit, cvd_test_df)

svm_rad_mx = table(death_svm_rad_pred, cvd_test_df$death)
svm_rad_mx
##                   
## death_svm_rad_pred  0  1
##                  0 37 18
##                  1  2  3
metrics_svm_rad = measure.classification(death_svm_rad_pred, cvd_test_df$death, svm_rad_mx)
rownames(metrics_svm_rad) = "SVM (Radial Kernel)"

metric_comparison = rbind(metric_comparison, metrics_svm_rad)

Decision Tree

Random Forest

Boosting

Comparisons

!!! WRITE DISCUSSION HERE !!!

The spreadsheet AckTemp.csv contain daily maximum temperature in

Auckland from 1 Jan 2014 to 30 June 2020. The attributes follow;

Feature Description

year; 2014, …, 2020 month; 1,2,…,12 day; 1,2,… maxT; (Maximum daily temperature in Celsius) 3, 6.7, 4.8 … 6.7

Key objects of daily maximum temperature series are estimated naively

and are presented graphically. We use this as prestudy to model the long term behaviour of daily max temperature.

From the prestudy (above figure), both seasonality and non-linear trend

are observed. We will treat month as a factor variable and assume fixed month effect.

year and day are not sufficient to parametrize the long-term trend and,

we will introduce the new variable y which it is a vector of cumulative days from 1 Jan 2014. For example, y=1 on 1 Jan 2014, …, y=31 on 31 Jan 2014, …, y=2283 on 30 June 2020.

2a) Create the new data in which there are the three variables, y, month

and maxT

akl_weather_df = read_csv("data/AckTemp.csv")
## Rows: 2283 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): year, month, day, maxT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(akl_weather_df)
## Rows: 2,283
## Columns: 4
## $ year  <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014…
## $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 2…
## $ maxT  <dbl> 21.9, 23.8, 25.6, 24.2, 23.7, 26.0, 22.2, 24.1, 21.9, 22.0, 22.3…
akl_weather_df$datetime <- ymd(paste(akl_weather_df$year, akl_weather_df$month, akl_weather_df$day, sep = "-"))
reference_datetime <- ymd("2014-01-01")

akl_weather_df = akl_weather_df %>%
  mutate(y = as.numeric(difftime(akl_weather_df$datetime, reference_datetime, units = "days")) + 1)

akl_weather_df = akl_weather_df[,c("y", "month", "maxT")]
akl_weather_df$month = as.factor(akl_weather_df$month)

glimpse(akl_weather_df)
## Rows: 2,283
## Columns: 3
## $ y     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 2…
## $ month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ maxT  <dbl> 21.9, 23.8, 25.6, 24.2, 23.7, 26.0, 22.2, 24.1, 21.9, 22.0, 22.3…

2b) Find an appropriate non-parametric model for daily maximum tem-

perature using the data in (a). Write reason(s) for your choice of model and degrees of freedom.

plot(maxT ~ y, data = akl_weather_df, xlab = "Days Since 1-1-2014 (Days)", ylab = "Maximum Temperature (C)", main = "Maximum Temperatures in Auckland (1-1-2014 - 30-6-2020)")

There is seasonality and trend on inspection. The seasonality is based on seasons in New Zealand, so we would expect a cycle period of 365 days (1 year).

NOTE: Use CV NOTE: create MSE plot Maybe choose parsim model?